library(maps)

load('alldata.RData')
## reads in a object called 'alldat'

## For some reason the Arithmetic.Mean variable needs to be converted to numeric
alldat$Arithmetic.Mean = as.numeric(alldat$Arithmetic.Mean)

dat = alldat

## Select years
Years = 1990:2010
dat = subset(dat, Year %in% Years)

## Select only the variables that will be used
dat = dat[, names(dat) %in% c("FIPS", "Year", "Monitor", "Latitude", "Longitude", "Parameter.Name", "Sample.Duration",
                              "Metric.Used", "State.Name", "County.Name",  
                              "Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",                     
                              "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",                       
                              "AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory", "a",
                              "Median_income", "HS_rate", "Urban_rate", "Migration_5_year_rate", "Hispanic_cens", "White_cens", "Black_cens", 
                              "Other_cens", "Current_Smoking_rate_weighted", "HOUSEUNIT", "POPULATION", "Female_cens", "county_ID", 
                              "Dew_point_annual_mean", "Dew_point_annual_STD", "temperature_annual_mean", "temperature_annual_STD", 
                              "temperature_annual_max_mean", "temperature_annual_min_mean", "Weather_total_site")]


## Reshape
datwide = reshape(dat, direction = "wide", 
      v.names = c(  "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",                       
                    "AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory",
                    "Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile", 
                     "Latitude", "Longitude", "Sample.Duration", "Metric.Used"), 
                    timevar = "Year", idvar = "Monitor")


### Need to select a single lat/long for each monitor.  If you just take one year, then latitude and longitude will be missing for any monitor
### that was not in the data for that year (e.g., if you take 1990 then you will miss a monitor that didn't start until 1999, and if you take 1999
### then you will miss a monitor that stopped operating in 1991).

getlast = function(x){
  return(x[max(which(!is.na(x)))])
}

datwide$Longitude = apply(datwide[, paste("Longitude.", Years, sep="")], 1, getlast)
datwide$Latitude = apply(datwide[, paste("Latitude.", Years, sep="")], 1, getlast)

datwide = datwide[, !(names(datwide) %in% paste("Longitude.", Years, sep=""))]
datwide = datwide[, !(names(datwide) %in% paste("Latitude.", Years, sep=""))]

#############################################
######     Calculate Variables          #####
#############################################

#Baseline and follow up PM 
makeavgvar=function(d,varprefix, years, whichdata){
tempmat=d[, names(datwide) %in% paste(varprefix,".", years,whichdata, sep="") ]
return(rowMeans(tempmat, na.rm=TRUE))
}

## Baseline PM10 - use 1990
datwide$pm10base1990 = datwide$Arithmetic.Mean.1990

## Baseline PM10 - use 1990 - 1994
datwide$pm10base1990_1994 = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1990", "1991", "1992", "1993", "1994"), "")

## Follow up PM10 - use average of 1999-2001
datwide$pm10fu = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1999", "2000", "2001"), "")

## Follow up PM10 - use average 95th %tile from 1999-2001
datwide$pm10fu95 = makeavgvar(d=datwide, varprefix="X95th.Percentile", years=c("1999", "2000", "2001"), "")

## Housing Density (from census)
datwide$housedens=datwide$HOUSEUNIT/datwide$POPULATION

datwide$loginc=log(datwide$Median_income)
datwide$logpop=log(datwide$POPULATION)
datwide$lhousedens = log(datwide$housedens)


###############################################
# Make Data Set for Imputing Missing Baseline #
###############################################
## Restrict to Western US
west = subset(datwide, Latitude > -130 & Longitude < -100 & Longitude > -130)

## Remove observations that have (missing baseline AND follow up PM) OR (missing covariates)
## Don't worry about Medicare data here because this data set will be used to impute pollution only (and does not require Medicare data)
west_restrict = subset(west, (!is.na(pm10base1990_1994) | !is.na(pm10fu)) & ( !is.na(Median_income) & !is.na(Current_Smoking_rate_weighted)))


## Use this for imputing baseline data
datforimpute = west_restrict
save(datforimpute, file = 'datforimpute.RData')

